1 Preparation
1.1 Packages
The tidyverse is the main workhorse of my R packages. Almost everything vital stems from its ecosystem and supplements the suite.
# Optional session-wide ggplot2 colour/fill scale defaults (currently disabled).
#options(ggplot2.continuous.color = "viridis")
#options(ggplot2.continuous.fill = "viridis")
#options(ggplot2.discrete.fill = "viridis")
# Heavily penalise scientific notation so numbers print in fixed notation.
options(scipen = 999)
# NOTE(review): hard-coded, machine-specific working directory; the relative
# CSV path read below presumably resolves against it -- confirm before changing.
setwd("~/Desktop/mfour_workshop/Reference")
#packages ----
#Workhorse
library(tidyverse)
#Import & Export
library(readxl)
library(writexl)
library(readr)
#Formatting & Visualization
library(tidyquant)
library(hrbrthemes)
library(kableExtra)
library(viridisLite)
library(scales)
library(DT)
library(ggrepel)
library(bbplot)
1.2 Reading the Data
Ingesting the data from a .csv that I originally built in Google Drive, which could lend itself to automated, ongoing reporting.
# Load the plant inventory export; column types are guessed by readr.
import_dat <- readr::read_csv(
  file = "Arcadian Gardens Inventory Analysis.csv"
)
## Rows: 73 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Scientific Name, Common Name, Maturity
## dbl (2): id, Watering Preference
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1.3 Glimpse of the Data
Using glimpse() to take a peek at the data with an easy-to-read summary.
# Compact column-by-column preview of the raw import.
glimpse(import_dat)
## Rows: 73
## Columns: 5
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ `Scientific Name` <chr> "Ficus benghalensis", "Musa acuminata", "Alocasi…
## $ `Common Name` <chr> "Ficus Audrey", "Dwarf Banana Tree", "Elephant E…
## $ Maturity <chr> "Adult", "Adult", "Juvenile", "Juvenile", "Adut"…
## $ `Watering Preference` <dbl> 7, 7, 7, 7, 7, 10, 4, 4, 7, 10, 10, 7, 4, 7, 4, …
2 Data Manipulation
2.1 Renaming the Columns
Altering the names of the columns to make them more manageable in R. We can always create prettier labels later on.
# Swap the spreadsheet headers for short snake_case names.
# `id` already has a usable name, so it is left untouched.
dat <- rename(
  import_dat,
  name_scientific = `Scientific Name`,
  name_common = `Common Name`,
  maturity = Maturity,
  water = `Watering Preference`
)
2.2 Fixing a Typo
Using a quick summary, we can see that one of the “adults” is missing a letter. Then, we account for this via case_when() to patch the data. This fixes it for any downstream inquiries into Maturity.
2.2.1 Spotting the Typo
# Tally plants per maturity label; a misspelt label shows up as its own row.
dat %>%
  count(maturity) %>%
  kbl(align = "l", format.args = list(big.mark = ",")) %>%
  kable_styling(
    full_width = FALSE, # spelled out: `F` is an ordinary, reassignable variable
    bootstrap_options = c("hover", "responsive", "striped")
  )
| maturity | n |
|---|---|
| Adult | 14 |
| Adut | 1 |
| Baby | 21 |
| Juvenile | 28 |
| Mature | 9 |
2.2.2 Fix with Case When
# Tidy text and patch known data-entry problems.
dat <- dat %>%
  mutate(
    # Title-case first so the comparison below sees the normalised spelling.
    name_scientific = str_to_title(name_scientific),
    # "Adut" is a typo for "Adult"; every other label passes through unchanged.
    maturity = case_when(
      maturity == "Adut" ~ "Adult",
      TRUE ~ maturity
    ),
    # The bare genus "Saintpaulia" is given an explicit species placeholder.
    name_scientific = case_when(
      name_scientific == "Saintpaulia" ~ "Saintpaulia Incognita",
      TRUE ~ name_scientific
    )
  )
2.3 Deriving Genus x Species
We are given the Genus and Species via the Scientific Name; the trick is extracting them. First, I use separate() to break out the genus, since each genus is consistently one word in my collection and is followed by a space.
The species were a little trickier due to having a variety of patterns that broke separate(), such as 2+ word species, apostrophization, hybrids & cultivars. So for the second part, I simply subtracted the new genus from the existing Scientific name via str_remove() to derive the species.
2.3.1 Engineering Genus
# Pull the genus (first word) out of the scientific name, keeping the
# original column, then reorder columns and tidy the common names.
dat <- dat %>%
  separate(
    col = name_scientific,
    into = "genus",
    sep = " ",
    # Everything after the first word is the species and is expected here;
    # drop it explicitly instead of triggering a warning on every row.
    extra = "drop",
    remove = FALSE
  ) %>%
  select(genus, contains("name"), water, maturity, id) %>%
  mutate(
    genus = as_factor(genus),
    name_common = str_to_title(name_common)
  )
# List each genus once, in order of first appearance.
dat %>%
  pull(genus) %>%
  unique()
## [1] Ficus Musa Alocasia Pilea Maranta
## [6] Anthurium Monstera Begonia Epiprenem Peperomia
## [11] Philodendron Saintpaulia Scindapsus Ludisia Ceropegia
## [16] Rhaphidophora Macodes Calathea Oxalis Zamioculcas
## [21] Aeschynanthus Adenium
## 22 Levels: Ficus Musa Alocasia Pilea Maranta Anthurium Monstera ... Adenium
2.3.2 Engineering Species
# Derive the species by stripping the genus from the scientific name.
dat <- dat %>%
  mutate(
    species = name_scientific %>%
      # fixed(): match the genus literally rather than as a regular
      # expression, so special characters in a name cannot change the match.
      str_remove(fixed(as.character(genus))) %>%
      trimws("both")
  ) %>%
  select(genus, species, contains("name"), maturity, water, id)
# List each derived species string once, in order of first appearance.
dat %>%
  pull(species) %>%
  unique()
## [1] "Benghalensis" "Acuminata"
## [3] "Odora Variegata" "Elastica 'Tineke'"
## [5] "Lyrata" "Peperomiodes"
## [7] "Leuconeura 'Kerchoviana'" "Regale"
## [9] "Deliciosa 'Albo Variegata'" "'Lois Burke'"
## [11] "Maculata" "Aurem"
## [13] "Leuconeura 'Red'" "Elastica 'Ruby'"
## [15] "Leuconeura 'Lemon Lime'" "Obtusifolia"
## [17] "Verrucosum 'Mini'" "Incognita"
## [19] "Adansonii" "'El Choco'"
## [21] "Pictus 'Argyraeus'" "Clarinervium"
## [23] "Dorayaki" "Crystallinum"
## [25] "Regale+Magnificum" "Magnificum"
## [27] "Discolor" "Erubescens"
## [29] "Woodii Variegata" "Pictus 'Silver Lady'"
## [31] "Prostrata" "Micans"
## [33] "Tetrasperma" "Petola"
## [35] "Cadieri" "Makoyana"
## [37] "Verrucosum X Melanochrysum" "Ornata"
## [39] "Regnelli" "Deliciosa"
## [41] "'Regal Shields'" "Elastica 'Burgundy'"
## [43] "Woodii" "Zamiifoli"
## [45] "Longicaulis" "‘Swan And Dragon’"
## [47] "Fibrous 'My Special Angel'" "Fibrous 'Whimsy'"
## [49] "Verrucosum"
2.4 Tidy’d Data
# Interactive, filterable view of the tidied inventory.
datatable(dat, filter = "bottom", style = "bootstrap5")
2.5 Stat’d Data
# Add per-plant statistics: how many plants share this plant's genus and
# scientific name, and what share of the whole collection that represents.
dat2 <- dat %>%
  add_count(genus, name = "n_genus") %>%
  add_count(name_scientific, name = "n_species") %>%
  arrange(desc(n_genus), desc(n_species)) %>%
  mutate(
    # Factor levels follow the sorted row order (largest groups first).
    genus = as_factor(genus),
    species = as_factor(species),
    # genus_scaled = n_genus / max(n_genus) * 100,
    # species_scaled = n_species / max(n_species) * 100,
    # Share of collection = group size / total number of plants (rows).
    # The counts are repeated on every row of their group, so dividing by
    # sum(n_genus) would sum squared group sizes and understate every share.
    perc_genus = n_genus / n() * 100,
    perc_species = n_species / n() * 100,
    percent_genus = scales::percent(n_genus / n()),
    percent_species = scales::percent(n_species / n())
  ) %>%
  select(
    name_scientific, genus, species,
    contains("n_"), contains("perc"), everything()
  )
# One row per genus with a comma-separated list of its distinct species.
# NOTE: the column is named `genera_collected` although it holds species;
# the name is kept because downstream code refers to it.
dat3 <- dat2 %>%
  group_by(genus) %>%
  summarise(genera_collected = toString(unique(species))) %>%
  ungroup()

# Attach the per-genus species list to every plant and move the key
# identifying columns (selected by name, not position) to the front.
plant_dat <- dat2 %>%
  left_join(dat3, by = "genus") %>%
  select(name_scientific, genus, species, genera_collected, everything())
# Interactive table of the enriched data: names, species list, counts and
# formatted percentages.
datatable(
  select(
    plant_dat,
    genus, name_scientific, name_common, genera_collected,
    contains("n_"), contains("percent_")
  ),
  filter = "bottom",
  style = "bootstrap5"
)
3 Exploration
3.1 Basic Overview
We see here that I have 73 plants in my collection which represents 22 unique genera and 49 unique species.
# Headline numbers: total plants, distinct genera, distinct scientific names.
plant_dat %>%
  summarise(
    n = n(),
    n_distinct_genus = n_distinct(genus),
    n_distinct_species = n_distinct(name_scientific)
  ) %>%
  kbl(align = "l", format.args = list(big.mark = ",")) %>%
  kable_styling(
    full_width = FALSE, # spelled out: `F` is an ordinary, reassignable variable
    bootstrap_options = c("hover", "responsive", "striped")
  )
| n | n_distinct_genus | n_distinct_species |
|---|---|---|
| 73 | 22 | 49 |
3.2 Genera
3.2.1 Summary
# Plants per genus (with its species list) and each genus's share of the
# collection. count() returns an ungrouped tibble, so no `.groups` handling
# or trailing ungroup() is needed.
viz_dat_genus <- plant_dat %>%
  count(genus, genera_collected) %>%
  mutate(percent_total = scales::percent(n / sum(n)))
# Render the per-genus summary as a styled HTML table.
viz_dat_genus %>%
  kbl(align = "l", format.args = list(big.mark = ",")) %>%
  kable_styling(
    full_width = FALSE, # spelled out: `F` is an ordinary, reassignable variable
    bootstrap_options = c("hover", "responsive", "striped")
  )
| genus | genera_collected | n | percent_total |
|---|---|---|---|
| Ficus | Lyrata, Elastica ‘Ruby’, Benghalensis, Elastica ‘Tineke’, Elastica ‘Burgundy’ | 7 | 9.6% |
| Musa | Acuminata | 1 | 1.4% |
| Alocasia | Odora Variegata, ‘Regal Shields’ | 3 | 4.1% |
| Pilea | Peperomiodes, Cadieri | 3 | 4.1% |
| Maranta | Leuconeura ‘Kerchoviana’, Leuconeura ‘Red’, Leuconeura ‘Lemon Lime’ | 3 | 4.1% |
| Anthurium | Clarinervium, Crystallinum, Magnificum, Regale, Dorayaki, Regale+Magnificum | 12 | 16.4% |
| Monstera | Deliciosa ‘Albo Variegata’, Deliciosa, Adansonii | 5 | 6.8% |
| Begonia | Maculata, ‘Lois Burke’, Fibrous ‘My Special Angel’, Fibrous ‘Whimsy’ | 5 | 6.8% |
| Epiprenem | Aurem | 1 | 1.4% |
| Peperomia | Prostrata, Obtusifolia | 4 | 5.5% |
| Philodendron | Verrucosum ‘Mini’, ‘El Choco’, Erubescens, Micans, Verrucosum X Melanochrysum, Verrucosum | 6 | 8.2% |
| Saintpaulia | Incognita | 1 | 1.4% |
| Scindapsus | Pictus ‘Argyraeus’, Pictus ‘Silver Lady’ | 2 | 2.7% |
| Ludisia | Discolor | 2 | 2.7% |
| Ceropegia | Woodii Variegata, Woodii | 4 | 5.5% |
| Rhaphidophora | Tetrasperma | 1 | 1.4% |
| Macodes | Petola | 4 | 5.5% |
| Calathea | Makoyana, Ornata | 2 | 2.7% |
| Oxalis | Regnelli | 1 | 1.4% |
| Zamioculcas | Zamiifoli | 4 | 5.5% |
| Aeschynanthus | Longicaulis | 1 | 1.4% |
| Adenium | ‘Swan And Dragon’ | 1 | 1.4% |
3.2.2 Viz
# Caption figures are computed from the data so they cannot drift from it
# (the previously hard-coded share did not match the per-genus counts).
genus_caption <- local({
  counts <- sort(viz_dat_genus$n, decreasing = TRUE)
  share <- scales::percent(sum(head(counts, 5)) / sum(counts), accuracy = 0.1)
  paste0(
    "Out of the ", n_distinct(viz_dat_genus$genus),
    " Genera represented, my top 5 represent ", share, " of my collection"
  )
})

# Horizontal bar chart of plants per genus, ordered and coloured by count.
# The `text` aesthetic is only consumed by plotly tooltips.
viz_genus <-
  viz_dat_genus %>%
  ggplot(aes(text = str_glue(
"Genera: {genus},
Count: {n},
Percent of Collection: {percent_total}
Species Collected: {genera_collected}"))) +
  geom_col(aes(x = fct_reorder(genus, n), y = n, fill = n)) +
  scale_y_continuous(breaks = seq(0, 12.5, by = 3)) +
  coord_flip() +
  scale_fill_viridis_c(option = "plasma", direction = -1) +
  theme_ipsum() +
  labs(
    title = "Favorite Genera",
    subtitle = "Can you tell I like Anthuriums?",
    caption = genus_caption,
    x = "Genera",
    y = "Count",
    fill = "Count"
  )
viz_genus
3.2.3 Interactive Viz
# Interactive version of the genus chart; tooltips show only the `text` aesthetic.
plotly::ggplotly(viz_genus, tooltip = "text")
3.3 Genus Species
Next, I’m going to visualize the species found with the various genera.
3.3.1 Top 5 Genera
To start, I’m homing in on the top 5 genera within my collection. The genera selected are: Anthurium, Ficus, Monstera, Begonia, & Philodendron.
I call this visualization Life in a Lock as I saw a parallel between the visualization and that of a keyhole. “Life” stemming from how the visualization depicts biodiversity within my collection.
# library
library(tidyverse)
# Create dataset: plants per species for every genus with at least 5 plants.
# count() returns an ungrouped tibble, so no `.groups` handling is needed.
data <- plant_dat %>%
  filter(n_genus >= 5) %>%
  count(genus, name_scientific, species)
# Pin the display order of genera and of species within them; any value
# not listed in `levels` becomes NA.
data <- data %>%
  mutate(
    genus = factor(
      genus,
      levels = c("Anthurium", "Ficus", "Monstera", "Begonia", "Philodendron")
    ),
    species = factor(
      species,
      levels = c(
        # Anthurium
        "Clarinervium", "Crystallinum", "Magnificum", "Regale",
        "Dorayaki", "Regale+Magnificum",
        # Ficus
        "Lyrata", "Elastica 'Ruby'", "Benghalensis", "Elastica 'Tineke'",
        "Elastica 'Burgundy'",
        # Monstera
        "Deliciosa 'Albo Variegata'", "Deliciosa", "Adansonii",
        # Begonia
        "Maculata", "'Lois Burke'", "Fibrous 'My Special Angel'",
        "Fibrous 'Whimsy'",
        # Philodendron
        "Verrucosum 'Mini'", "'El Choco'", "Erubescens", "Micans",
        "Verrucosum X Melanochrysum", "Verrucosum"
      )
    )
  )
# Keep only the three plotting columns under the generic names used below.
data <- data %>%
  select(group = genus, individual = species, value = n)
# Number of all-NA placeholder rows ("empty bars") appended per group
empty_bar <- 4
# Build an all-NA frame with the same columns, then label each run of
# `empty_bar` rows with its group.
to_add <- data.frame(
  matrix(NA, nrow = empty_bar * nlevels(data$group), ncol = ncol(data))
)
names(to_add) <- names(data)
to_add$group <- rep(levels(data$group), each = empty_bar)
# Append the placeholders, sort by group, and number the rows.
data <- arrange(rbind(data, to_add), group)
data$id <- seq_len(nrow(data))
#
# label_data <- data
# number_of_bar <- nrow(label_data)
# label_data$angle <- 360 * (label_data$id-0.5)/number_of_bar # I substract 0.5 because the letter must have the angle of the center of the bars. Not extreme right(1) or extreme left (0)
# # label_data$hjust <- ifelse(angle < -90, 1, 0)
# # label_data$angle <- ifelse(angle < -90, angle+180, angle)
# Make the plot: circular bar chart of species counts, coloured by genus.
# Caption figures are computed from plant_dat so they cannot drift from the
# data (the previously hard-coded share did not match the per-genus counts).
lock_caption <- local({
  counts <- plant_dat %>%
    count(genus, sort = TRUE) %>%
    pull(n)
  share <- scales::percent(sum(head(counts, 5)) / sum(counts), accuracy = 0.1)
  paste0(
    "Out of the ", length(counts),
    " Genera represented, my top 5 represent ", share, " of my collection"
  )
})

p <- ggplot(data, aes(x = individual, y = value, fill = group)) +
  geom_bar(stat = "identity") +
  # The negative lower limit leaves a hole in the middle of the polar chart.
  ylim(-1, 5) +
  scale_fill_viridis_d(option = "plasma", direction = 1) +
  theme_ipsum() +
  labs(
    title = "Life in a Lock",
    subtitle = "Biodiversity amongst my top Genera",
    caption = lock_caption,
    x = "",
    y = "Count",
    fill = "Genera"
  ) +
  theme(
    legend.position = "bottom",
    # axis.text = element_blank(),
    # axis.text.x = element_text(angle = 90),
    # axis.title = element_blank(),
    # panel.grid = element_blank(),
    plot.margin = margin(.5, .5, .5, .5, "cm")
  ) +
  # `start` rotates the origin (radians); clip = "off" lets labels overflow.
  coord_polar(start = 3.05, clip = "off")
p
## Warning: Removed 20 rows containing missing values (position_stack).